library(tidyverse)
library(magrittr)
library(pheatmap)
counts <- read.delim(paste0(dir, "Data/count_data.csv"), sep = ",")
mean_counts <- read.delim(paste0(dir, "Data/mean_counts.csv"), sep = ",") %>%
group_by(gene, age_group) %>%
summarize(counts = mean(counts))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
head(mean_counts)
## # A tibble: 6 × 3
## # Groups: gene [2]
## gene age_group counts
## <chr> <chr> <dbl>
## 1 A1BG 1-15 143.
## 2 A1BG 16-26 130.
## 3 A1BG 27-60 111.
## 4 A1BG 61-85 134.
## 5 A1BG 86-96 45.5
## 6 A2M 1-15 465.
TF_targets <- read.delim(paste0(dir, "Data/tf-target-information.txt")) %>%
subset(select = c("TF", "target")) %>%
distinct() %>%
drop_na()
head(TF_targets)
## TF target
## 1 AEBP2 TMEM53
## 2 AEBP2 C1orf228
## 3 AEBP2 FBXO31
## 4 AEBP2 ADAMTSL5
## 5 AEBP2 CTB-25B13.9
## 6 AEBP2 MIR6791
p_0.05 <- read.delim(paste0(dir, "Data/DE_var_p_n_200.csv"), sep = ",")
head(p_0.05)
## gene transition abs_log2_fc
## 1 A2M fc_1-15_16-26 2.0837541
## 2 A2M fc_16-26_27-60 2.1870559
## 3 AARD fc_1-15_16-26 1.7421494
## 4 ABCA3 fc_27-60_61-85 0.8601670
## 5 ABCA5 fc_16-26_27-60 0.6800931
## 6 ABCA5 fc_61-85_86-96 1.8283348
# data frame to map to the following transition
transitions <- data.frame(TF_transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85"),
target_transition = c("fc_16-26_27-60", "fc_27-60_61-85", "fc_61-85_86-96"))
# select DE TFs and their DE targets in the following transition
DE_TFs_targets <- filter(p_0.05, gene %in% TF_targets$TF) %>%
filter(transition != "fc_61-85_86-96") %>%
dplyr::rename(TF = gene, TF_transition = transition, TF_fc = abs_log2_fc) %>%
left_join(transitions) %>%
left_join(TF_targets) %>%
left_join(dplyr::rename(p_0.05, target = gene, target_transition = transition)) %>%
drop_na(abs_log2_fc)
## Joining, by = "TF_transition"
## Joining, by = "TF"
## Joining, by = c("target_transition", "target")
head(DE_TFs_targets)
## TF TF_transition TF_fc target_transition target abs_log2_fc
## 1 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 MOAP1 22.5642379
## 2 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 WNT2B 1.1286318
## 3 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 RRN3 23.9218949
## 4 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 NIPAL3 0.4870438
## 5 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 COL16A1 0.5443583
## 6 ERG fc_1-15_16-26 1.899511 fc_16-26_27-60 ZNF697 0.5524617
group_by(DE_TFs_targets, TF) %>% summarize(n_DE_targets = n()) %>%
arrange(desc(n_DE_targets))
## # A tibble: 13 × 2
## TF n_DE_targets
## <chr> <int>
## 1 MYH11 99
## 2 ERG 86
## 3 FOS 58
## 4 TFAP2C 55
## 5 MECOM 24
## 6 PGR 20
## 7 RUNX2 15
## 8 MAFK 9
## 9 FOXP2 6
## 10 TCF7L2 3
## 11 MEIS1 2
## 12 HOXA13 1
## 13 NR5A2 1
expr_DE_targets <- left_join(DE_TFs_targets, dplyr::rename(mean_counts, target = gene)) %>%
filter(TF == "FOS") %>%
subset(select = c(target, age_group, counts)) %>%
#drop_na() %>%
spread(key = age_group, value = counts)
## Joining, by = "target"
head(expr_DE_targets)
## target 1-15 16-26 27-60 61-85 86-96
## 1 ACTN4 8659.1706 8282.4531 7337.9509 8586.8436 5003.89874
## 2 ANLN 5757.1681 6237.0074 6537.9018 5922.7134 932.15686
## 3 BUB1 1233.4397 1263.9336 1299.8837 1117.4248 220.35056
## 4 C1orf159 145.7017 128.9521 119.3703 139.6103 42.86685
## 5 C21orf58 240.3914 208.6302 205.2445 187.6005 54.64397
## 6 C7orf50 926.5183 825.3168 735.9493 816.6335 263.22009
# Create matrix
hclust_matrix <- select(expr_DE_targets, -target) %>%
as.matrix()
rownames(hclust_matrix) <- expr_DE_targets$target
# Z-transform values for each gene
hclust_matrix %<>% t() %>% scale() %>% t()
# Calculate euclidean distances between genes
gene_dist <- dist(hclust_matrix)
# Clustering
gene_hclust <- hclust(gene_dist, method = "ward.D2")
# Plot dendrogram
plot(gene_hclust, labels = FALSE)
gene_cluster <- cutree(gene_hclust, k = 4) %>%
enframe() %>%
dplyr::rename(gene = name, cluster = value) %>%
left_join(group_by(mean_counts, gene) %>% mutate(counts = scale(counts)))
## Joining, by = "gene"
ggplot(gene_cluster, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~cluster, nrow = 1)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()
results <- subset(gene_cluster, select = c(gene, cluster)) %>% distinct()
write.csv(results, paste0(dir, "Data/cluster_FOS_targets.csv"), row.names = FALSE)
# Read in difference map for FOS DE targets
diff <- read.delim(paste0(dir, "Data/difference_maps/diff_FOS_DE_targets.csv"), sep = ",") %>%
gather(key = 'chr2', value = 'difference', -chr1)
# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
dplyr::rename('locus' = 'chr1') %>%
subset(select = c(locus, n, group)) %>%
filter(n >= 7)
## Joining, by = "difference"
head(diff_counts)
## # A tibble: 6 × 3
## # Groups: locus [6]
## locus n group
## <chr> <int> <chr>
## 1 chr_1_loc_1000000 7 young-specific
## 2 chr_1_loc_1250000 7 young-specific
## 3 chr_11_loc_65000000 7 young-specific
## 4 chr_17_loc_78000000 7 old-specific
## 5 chr_20_loc_33500000 8 young-specific
## 6 chr_21_loc_46250000 7 young-specific
# Map loci to genes
loci <- unique(diff$chr1)
gene_loci <- read.delim(paste0(dir, "Data/all_gene_loci.csv"), sep = ",") %>%
filter(gene %in% expr_DE_targets$target) %>%
left_join(diff_counts) %>%
mutate(group = replace_na(group, "no_differences"))
## Joining, by = "locus"
# Add expression values
expr_targets <- left_join(gene_loci, mean_counts) %>%
subset(select = c(gene, age_group, counts, group)) %>%
group_by(gene) %>%
mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~group, nrow = 1)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()
# Read in difference map for last PCST loci and Louvain clusters
diff <- read.delim(paste0(dir, "Data/difference_maps/diff_last_PCST_loci.csv"), sep = ",") %>%
gather(key = 'chr2', value = 'difference', -chr1)
clusters <- read.delim(paste0(dir, 'Data/difference_maps/louvain_clusters_last_PCST.csv'), sep = ",")
#clusters %<>% mutate(cluster = sample(clusters$cluster, nrow(clusters)))
# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
dplyr::rename('locus' = 'chr1') %>%
subset(select = c(locus, n, group)) %>%
filter(n >= 40)
## Joining, by = "difference"
# Add number of differences to the annotation df
gene_loci <- clusters %>%
left_join(diff_counts) %>%
mutate(group = replace_na(group, "no_differences"))
## Joining, by = "locus"
# Add expression values
expr_targets <- left_join(gene_loci, mean_counts) %>%
subset(select = c(gene, locus, group, cluster, age_group, counts)) %>%
group_by(gene) %>%
mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~group, nrow = 1)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()
ggplot(expr_targets, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~cluster, nrow = 2)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()+
theme(text = element_text(size = 15))
# Number of differences
diff_counts <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
dplyr::rename('locus' = 'chr1') %>%
subset(select = c(locus, n, group))
## Joining, by = "difference"
# Get top 10 loci per cluster (with most differences)
selection <- clusters %>%
left_join(diff_counts) %>%
group_by(cluster) %>%
slice_max(n, n = 10, with_ties = FALSE)
## Joining, by = "locus"
# Expression patterns only for the top 10 loci per cluster
expr_targets_filt <- left_join(selection, mean_counts) %>%
subset(select = c(gene, locus, group, cluster, age_group, counts)) %>%
group_by(gene) %>%
mutate(counts = scale(counts))
## Joining, by = "gene"
ggplot(expr_targets_filt, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~cluster)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()+
theme(text = element_text(size = 15))
clusters_diff <- group_by(diff, chr1, difference) %>% tally() %>% filter(difference != 0) %>%
left_join(data.frame("difference" = c(1, -1), "group" = c('young-specific', 'old-specific'))) %>%
dplyr::rename('locus' = 'chr1') %>%
left_join(clusters) %>%
mutate(group = factor(group, levels = c('young-specific', 'old-specific'))) %>%
group_by(group, cluster) %>%
mutate(mean = mean(n))
## Joining, by = "difference"
## Joining, by = "locus"
ggplot(clusters_diff, aes(x = n)) +
geom_histogram() +
facet_grid(rows = vars(group), cols = vars(cluster), scale = "free_x") +
geom_vline(aes(xintercept = mean, group = cluster), colour = 'red') +
theme_bw() +
xlab('number of differences') +
ylab('number of loci')+
theme(text = element_text(size = 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
clusters_diff_wide <- subset(clusters_diff, select = c(locus, gene, cluster, group, n)) %>%
spread(key = group, value = n, fill = 0) %>%
mutate(cluster = factor(cluster, levels = seq(0, 5, 1)))
colnames(clusters_diff_wide) <- c('locus', 'gene', 'cluster', 'young_specific', 'old_specific')
ggplot(clusters_diff_wide, aes(x = young_specific, y = old_specific, color = cluster)) +
geom_point() +
xlab('young-specific interactions per locus') +
ylab('old-specific interactions per locus') +
facet_wrap(~cluster) +
theme_bw()+
theme(text = element_text(size = 17)) +
geom_vline(xintercept = 10) +
geom_hline(yintercept = 10)
switching_loci <- filter(clusters_diff_wide, young_specific > 10, old_specific > 10)
# Create matrix
hclust_matrix <- select(ungroup(expr_targets), -c(group, locus, cluster)) %>%
spread(key = age_group, value = counts) %>%
column_to_rownames('gene') %>%
as.matrix()
# Calculate euclidean distances between genes
gene_dist <- dist(hclust_matrix)
# Clustering
gene_hclust <- hclust(gene_dist, method = "ward.D2")
# Plot dendrogram
plot(gene_hclust, labels = FALSE)
gene_cluster <- cutree(gene_hclust, k = 6) %>%
enframe() %>%
dplyr::rename(gene = name, cluster = value) %>%
left_join(group_by(mean_counts, gene) %>% mutate(counts = scale(counts)))
## Joining, by = "gene"
ggplot(gene_cluster, aes(x = age_group, y = counts, group = gene)) +
geom_line(alpha = 0.3) +
facet_wrap(~cluster, nrow = 2)+
geom_line(stat = "summary", fun = "median", colour = "brown", size = 1.2, aes(group = 1)) +
theme_bw()+
theme(text = element_text(size = 15))
targets_per_cluster <- left_join(clusters, dplyr::rename(TF_targets, 'gene' = 'target')) %>%
group_by(TF, cluster) %>%
summarize(number_of_targets = n()) %>%
drop_na()
## Joining, by = "gene"
## `summarise()` has grouped output by 'TF'. You can override using the `.groups`
## argument.
head(targets_per_cluster)
## # A tibble: 6 × 3
## # Groups: TF [3]
## TF cluster number_of_targets
## <chr> <int> <int>
## 1 AFF4 0 1
## 2 AHR 0 3
## 3 AHR 1 2
## 4 AHR 2 2
## 5 AHR 3 1
## 6 AR 0 15
# for normalization: number of genes per cluster and percentages that each TF targets
genes_per_cluster <- group_by(clusters, cluster) %>% summarize(n = n())
# percentage of targets per TF
percentages <- targets_per_cluster %>%
group_by(TF) %>%
summarize(number_of_targets = sum(number_of_targets)) %>%
mutate(percentage_targets = number_of_targets / length(clusters$gene))
wide <- spread(targets_per_cluster, key = cluster, value = number_of_targets, fill = 0) %>%
column_to_rownames('TF')
colnames(wide) <- c('cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5')
# enrichment score: percentage of genes targeted in the cluster / percentages of genes targeted overall (in the PCST genes)
wide <- t(t(wide)/genes_per_cluster$n)
#wide <- wide / percentages$percentage_targets
#pheatmap(wide, show_rownames = F, clustering_method = 'ward.D2',
# color = colorRampPalette(brewer.pal(n = 7, name = "YlOrRd"))(100))
pheatmap(wide, show_rownames = F, clustering_method = 'ward.D2', scale = "row")
–> There are groups of TFs that only target genes in one cluster specifically. Do these TF groups overlap with the ones we identified in our PCST analysis?
# load info about which TFs occurred in which of the three network
pcst_TFs = read.delim(paste0(dir, 'Data/pcst/incl_TFs_design2.csv'), sep = ",") %>%
mutate(included = "True") %>%
mutate(net = factor(net, levels = c('young_net', 'middle_net', 'old_net'))) %>% #keeps ordering
spread(key = net, value = included, fill = "False") %>%
right_join(data.frame('TF' = rownames(wide))) %>%
replace(is.na(.), 'False') %>%
column_to_rownames('TF')
## Joining, by = "TF"
head(pcst_TFs)
## young_net middle_net old_net
## AHR True True True
## AR False True False
## ARNT False False True
## ASCL1 True True False
## ASCL2 False True True
## ATF1 True True True
# add annotation bars according to PCST
annoCol<-list(young_net=c(True="blue", False="grey"),
middle_net = c(True = 'red', False = 'grey'),
old_net = c(True = 'green', False = 'grey'))
pheatmap(wide, show_rownames = F, clustering_method = 'average',
annotation_row = pcst_TFs, annotation_colors = annoCol, scale = "row")